{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyalink.alink import *\n",
    "useLocalEnv(1)\n",
    "\n",
    "from utils import *\n",
    "import os\n",
    "import pandas as pd\n",
    "\n",
    "DATA_DIR = ROOT_DIR + \"father_son\" + os.sep\n",
    "\n",
    "ORIGIN_FILE = \"Pearson.txt\";\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "source = CsvSourceBatchOp()\\\n",
    "    .setFilePath(DATA_DIR + ORIGIN_FILE)\\\n",
    "    .setSchemaStr(\"father double, son double\")\\\n",
    "    .setFieldDelimiter(\"\\t\")\\\n",
    "    .setIgnoreFirstLine(True);\n",
    "\n",
    "source.firstN(5).print();"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "df_source = source.collectToDataframe()\n",
    "plt.figure(figsize=(8, 8))\n",
    "plt.scatter(df_source['father'], df_source['son'], color='blue', s=2)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "source.lazyPrintStatistics();\n",
    "\n",
    "BatchOperator.execute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_plus_one = source\\\n",
    "    .select(\"father, son, father+1 AS plus_one\")\\\n",
    "    .collectToDataframe()\n",
    "plt.figure(figsize=(8, 8))\n",
    "plt.scatter(df_source['father'], df_source['son'], color='blue', s=2)\n",
    "plt.plot(df_plus_one['father'], df_plus_one['plus_one'], color='grey', linewidth=2)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "source.filter(\"father>=71.5 AND father<72.5\").lazyPrintStatistics(\"father 72\");\n",
    "\n",
    "source.filter(\"father>=64.5 AND father<65.5\").lazyPrintStatistics(\"father 65\");\n",
    "\n",
    "BatchOperator.execute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "linear_model = LinearRegTrainBatchOp()\\\n",
    "    .setFeatureCols([\"father\"])\\\n",
    "    .setLabelCol(\"son\")\\\n",
    "    .linkFrom(source);\n",
    "\n",
    "linear_model.lazyPrintTrainInfo();\n",
    "linear_model.lazyPrintModelInfo();\n",
    "\n",
    "linear_reg = LinearRegPredictBatchOp()\\\n",
    "    .setPredictionCol(\"linear_reg\")\\\n",
    "    .linkFrom(linear_model, source);\n",
    "\n",
    "linear_reg.lazyPrint(5);\n",
    "\n",
    "BatchOperator.execute();"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_linear_reg = linear_reg.collectToDataframe()\n",
    "\n",
    "plt.figure(figsize=(8, 8))\n",
    "plt.scatter(df_source['father'], df_source['son'], color='blue', s=2)\n",
    "plt.plot(df_plus_one['father'], df_plus_one['plus_one'], color='grey', linewidth=2)\n",
    "plt.plot(df_linear_reg['father'], df_linear_reg['linear_reg'], color='red', linewidth=2)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
